package data_deepprocessing.algorithm.ssvm.ssvm_util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import data_deepprocessing.algorithm.crfs.CreateCRFsDataSet;
import data_deepprocessing.algorithm.ssvm.bean.WordMarkBean;
import data_deepprocessing.prepareData.beans.Node2vec_WordBookBean;
import data_deepprocessing.prepareData.beans.XianBingShi_new_zhangBean;
import data_deepprocessing.prepareData.db.InitSeedDB;
import data_deepprocessing.prepareData.db.Node2vec_WordBookDB;
import data_deepprocessing.prepareData.db.XianBingShi_New_zhangDB;
import data_deepprocessing.util.FilePathUtil;
import data_deepprocessing.util.FileUtil;

/**
 * @author YUHU YUAN
 * @date created 2017-03-31 22:27:31
 * @version 1.0
 *
 *          Pipeline overview: 1: load the List of XianBingShi beans; 2: obtain the
 *          word-segmented results; 3: produce the vector and model files via word
 *          embedding (node2vec); 4: combine the bean list with the vector/model files
 *          to generate the configuration files SSVM needs; 5: build the training and
 *          test datasets; 6: invoke the algorithm to start the experiment.
 *
 *          Note: "XBS" is shorthand for XianBingShi (present illness history).
 */

public class SsvmService2CrossValidService2Node2vec2Word {

	// Collaborators, injected via the setters at the bottom of this class (bean-style wiring).
	private InitSeedDB initSeedDB; // source of the symptom seed dictionary
	private XianBingShi_New_zhangDB xianBingShi_New_zhangDB; // source of the segmented XianBingShi records
	private Node2vec_WordBookDB node2vec_WordBookDB; // source of the num->word book used to decode node2vec output
	private CreateCRFsDataSet createCRFsDataSet; // writes the CRF-format fold files

	/**
	 * The steps below were added for the node2vec/word variant.
	 */
	/**
	 * Loads every word-book row (numeric node id paired with its word) from the database.
	 *
	 * @return all entries of the node2vec word book
	 */
	private List<Node2vec_WordBookBean> doGetWordBookBeans() {
		List<Node2vec_WordBookBean> wordBookEntries = this.node2vec_WordBookDB.selectAllWordBook2Word();
		return wordBookEntries;
	}

	// num -> word, decoded from the node2vec word book (node2vec identifies nodes by number).
	private static Map<Integer, String> Num_WordMap = new HashMap<>();

	// word -> space-separated embedding vector string, populated by doBuildWord_VectorMap().
	// NOTE(review): static mutable state shared across all instances — confirm this class
	// is only ever used as a singleton.
	private static Map<String, String> Word_VectorMap = new HashMap<>();

	/**
	 * Builds the word -> embedding-vector lookup table.
	 *
	 * <p>First fills {@code Num_WordMap} from the word book, then reads the node2vec
	 * vector file (format: "&lt;num&gt; &lt;v1&gt; &lt;v2&gt; ...") and stores each
	 * vector string keyed by its word.
	 *
	 * <p>Fix: the original left the BufferedReader open (resource leak); it is now
	 * closed via try-with-resources, including on exception.
	 *
	 * @throws IOException if the vector file cannot be read
	 */
	private void doBuildWord_VectorMap() throws IOException {
		for (Node2vec_WordBookBean bean : doGetWordBookBeans()) {
			Num_WordMap.put(bean.getNum(), bean.getWord());
		}

		try (BufferedReader reader = FileUtil.getReader(FilePathUtil.node2vec_vectorPath2EN)) {
			reader.readLine();// the first line is a header, not vector data — skip it
			String line;
			while ((line = reader.readLine()) != null) {
				// Split into [num, rest-of-line]; the vector stays one space-separated string.
				String[] tempStrings = line.split(" ", 2);
				// NOTE(review): a num absent from the word book yields a null key here — confirm
				// the vector file only contains ids present in the word book.
				Word_VectorMap.put(Num_WordMap.get(Integer.valueOf(tempStrings[0])), tempStrings[1]);
			}
		}
	}

	/**
	 * End of the added steps.
	 */

	// TODO(review): first hand-pick a subset, then try 10-fold cross validation;
	// whether this is usable end-to-end is still unconfirmed. For now, train on the
	// full set and hope the generated dataset is correct.
	/**
	 * Fetches all word-segmented XianBingShi (present illness) records from the database.
	 *
	 * @return every segmented record
	 */
	private List<XianBingShi_new_zhangBean> doGetSegmentedXianBingShi() {
		List<XianBingShi_new_zhangBean> segmentedRecords = this.xianBingShi_New_zhangDB.selectALlXianBingShi();
		return segmentedRecords;
	}

	/**
	 * Fetches the symptom seed dictionary used to tag the CRF/SSVM data.
	 *
	 * @return all seed dictionary entries
	 */
	private List<String> doGetSymptom_Dictionary() {
		List<String> seedContents = this.initSeedDB.selectAllSeedContent();
		return seedContents;
	}

	/**
	 * Randomly partitions all segmented XianBingShi records into the ten folds of
	 * {@code divided_dataset} for cross validation.
	 *
	 * <p>Fix: the original repeatedly called {@code remove(index)} on an ArrayList,
	 * which is O(n) per removal and O(n^2) overall. This version removes in O(1) by
	 * swapping the picked element with the last one. The uniform random pick and the
	 * fold assignment ((current size) % 10) are unchanged, so folds stay balanced
	 * (sizes differ by at most one). A defensive copy avoids mutating — and avoids
	 * requiring mutability of — the list returned by the database layer.
	 */
	private void divideDataset() {
		List<XianBingShi_new_zhangBean> pool = new ArrayList<>(doGetSegmentedXianBingShi());
		while (!pool.isEmpty()) {
			int size = pool.size();
			int index = (int) (Math.random() * size);
			XianBingShi_new_zhangBean picked = pool.get(index);
			// O(1) removal: overwrite the picked slot with the last element, drop the tail.
			pool.set(index, pool.get(size - 1));
			pool.remove(size - 1);
			divided_dataset.get(size % 10).add(picked);
		}
	}

	// The ten cross-validation folds, keyed 0..9; filled by divideDataset().
	private Map<Integer, List<XianBingShi_new_zhangBean>> divided_dataset = initMap();

	/**
	 * Creates the ten empty fold buckets keyed 0 through 9.
	 *
	 * @return a map with one empty list per fold index
	 */
	private Map<Integer, List<XianBingShi_new_zhangBean>> initMap() {
		Map<Integer, List<XianBingShi_new_zhangBean>> folds = new HashMap<>();
		for (int foldIndex = 0; foldIndex <= 9; ++foldIndex) {
			folds.put(foldIndex, new ArrayList<XianBingShi_new_zhangBean>());
		}
		return folds;
	}

	/**
	 * Generates the CRF-format 10-fold cross-validation datasets: for each fold i,
	 * fold i becomes the test set and the other nine folds the training set, written
	 * under {@code path}/dataCRF&lt;i&gt;/test and /train respectively.
	 *
	 * <p>Fixes: removed the dead {@code setLength(0)} calls (the StringBuffers were
	 * recreated every iteration anyway) and the redundant per-path StringBuffer
	 * building; exceptions from one fold are still logged and the remaining folds
	 * are still generated (best-effort, as before).
	 */
	public void doGenerateCRFCrossValidDataSet() {
		divideDataset();
		List<String> initSeeds = doGetSymptom_Dictionary();
		for (int i = 0; i < divided_dataset.size(); ++i) {
			String foldDir = path + "dataCRF" + i + File.separator;

			// Fold i is the test set.
			List<XianBingShi_new_zhangBean> testDataSet = divided_dataset.get(i);
			try {
				createCRFsDataSet.doCreateCrfDataUpGradeThree2CRFNew(foldDir + "test", initSeeds, testDataSet, "S");
			} catch (Exception e) {
				e.printStackTrace(); // best-effort: keep generating the remaining folds
			}

			// All other folds together form the training set.
			List<XianBingShi_new_zhangBean> trainDataSet = new ArrayList<>();
			for (int j = 0; j < divided_dataset.size(); ++j) {
				if (j != i) {
					trainDataSet.addAll(divided_dataset.get(j));
				}
			}
			try {
				createCRFsDataSet.doCreateCrfDataUpGradeThree2CRFNew(foldDir + "train", initSeeds, trainDataSet, "S");
			} catch (Exception e) {
				e.printStackTrace(); // best-effort: keep generating the remaining folds
			}
		}
	}

	/**
	 * Writes the SSVM training file for one fold.
	 *
	 * <p>Each CRF input file (lines of "&lt;word&gt;\t&lt;mark&gt;") becomes one query
	 * block; each word becomes one line: "&lt;tag&gt; qid:&lt;q&gt; 1:v1 2:v2 ... #word",
	 * where tag is 1 for "O", 2 for "B-S", 3 for "E-S" (empty otherwise, as before).
	 *
	 * <p>Fixes: (1) {@code Word_VectorMap.get(word)} could return null and the original
	 * called {@code split} on it before its (therefore dead) null check, causing an NPE —
	 * the null check now happens first and unknown words simply get no feature columns;
	 * (2) per-file readers leaked (only the last one was closed) and the writer leaked on
	 * exception — both now use try-with-resources; (3) a null {@code listFiles()} result
	 * (missing directory) no longer NPEs; (4) dropped the wasteful per-line flush (close
	 * flushes) and switched to an unsynchronized StringBuilder.
	 *
	 * @param outputPath path of the train.dat file to write
	 * @param inputPath  directory containing the CRF-format fold files
	 * @throws IOException if reading or writing fails
	 */
	private void doGenerateSsvmDataSet2Train(String outputPath, String inputPath) throws IOException {
		File[] crfFiles = new File(inputPath).listFiles();
		if (crfFiles == null) {
			return; // inputPath is missing or not a directory; nothing to generate
		}
		int qid = 1;// the SVM tool requires qid to start at 1 and increase
		try (BufferedWriter writer = FileUtil.getWriter(outputPath)) {
			for (File file : crfFiles) {
				List<WordMarkBean> wordMarkBeans = new ArrayList<>();
				try (BufferedReader reader = FileUtil.getReader(file)) {
					String line;
					while ((line = reader.readLine()) != null) {
						String[] temp = line.split("\t");
						wordMarkBeans.add(new WordMarkBean(temp[0], temp[1]));
					}
				}

				StringBuilder constructTerm = new StringBuilder();
				for (WordMarkBean bean : wordMarkBeans) {
					// Null-safe embedding lookup (see Javadoc fix #1).
					String vectorLine = Word_VectorMap.get(bean.getWord());
					String[] vector = (vectorLine == null) ? null : vectorLine.split(" ");
					String mark = bean.getMark();
					String tag = "";
					if (mark.equals("O")) {
						tag = "1";
					} else if (mark.equals("B-S")) {
						tag = "2";
					} else if (mark.equals("E-S")) {
						tag = "3";
					}
					constructTerm.append(tag).append(" ");
					constructTerm.append("qid:").append(qid).append(" ");
					if (vector != null) {
						for (int i = 0; i < vector.length; ++i) {
							// Feature indices are 1-based.
							constructTerm.append(i + 1).append(":").append(vector[i]).append(" ");
						}
					}
					constructTerm.append("#").append(bean.getWord());
					writer.write(constructTerm.toString());
					writer.newLine();
					constructTerm.setLength(0);
				}
				++qid;
			}
		}
	}

	/**
	 * Writes the SSVM test file for one fold, plus a parallel "judge" file that keeps
	 * the gold tag, the source file name, the qid and the word (no feature columns)
	 * for later evaluation.
	 *
	 * <p>Line formats mirror {@link #doGenerateSsvmDataSet2Train}: tag 1 for "O",
	 * 2 for "B-S", 3 for "E-S" (empty otherwise, as before), 1-based feature indices.
	 *
	 * <p>Fixes: (1) null-safe embedding lookup — the original called {@code split} on a
	 * possibly-null {@code Word_VectorMap.get(word)} before its (dead) null check;
	 * (2) the per-file readers were never closed and the writers leaked on exception —
	 * all are now managed by try-with-resources; (3) a null {@code listFiles()} result
	 * no longer NPEs; (4) dropped the per-line flushes and switched to StringBuilder.
	 *
	 * @param outputPath       path of the test file to write
	 * @param outputPath2Judge path of the companion judge file
	 * @param inputPath        directory containing the CRF-format fold files
	 * @throws IOException if reading or writing fails
	 */
	private void doGenerateSsvmDataSet2Test(String outputPath, String outputPath2Judge, String inputPath)
			throws IOException {
		File[] crfFiles = new File(inputPath).listFiles();
		if (crfFiles == null) {
			return; // inputPath is missing or not a directory; nothing to generate
		}
		int qid = 1; // the SVM tool requires qid to start at 1, so we number files ourselves
		try (BufferedWriter writer = FileUtil.getWriter(outputPath);
				BufferedWriter writer2judge = FileUtil.getWriter(outputPath2Judge)) {
			StringBuilder constructTerm = new StringBuilder();
			StringBuilder constructTerm2judge = new StringBuilder();
			for (File file : crfFiles) {
				List<WordMarkBean> wordMarkBeans = new ArrayList<>();
				try (BufferedReader reader = FileUtil.getReader(file)) {
					String line;
					while ((line = reader.readLine()) != null) {
						String[] temp = line.split("\t");
						wordMarkBeans.add(new WordMarkBean(temp[0], temp[1]));
					}
				}
				for (WordMarkBean bean : wordMarkBeans) {
					// Null-safe embedding lookup (see Javadoc fix #1).
					String vectorLine = Word_VectorMap.get(bean.getWord());
					String[] vector = (vectorLine == null) ? null : vectorLine.split(" ");
					String mark = bean.getMark();
					String tag = "";
					if (mark.equals("O")) {
						tag = "1";
					} else if (mark.equals("B-S")) {
						tag = "2";
					} else if (mark.equals("E-S")) {
						tag = "3";
					}
					constructTerm.append(tag).append(" ");
					constructTerm.append("qid:").append(qid).append(" ");
					constructTerm2judge.append(tag).append(" ");
					constructTerm2judge.append("qid:").append(file.getName()).append(" ").append(qid).append(" ");
					if (vector != null) {
						for (int i = 0; i < vector.length; ++i) {
							constructTerm.append(i + 1).append(":").append(vector[i]).append(" ");
						}
					}
					constructTerm.append("#").append(bean.getWord());
					constructTerm2judge.append("#").append(bean.getWord());
					writer.write(constructTerm.toString());
					writer.newLine();
					writer2judge.write(constructTerm2judge.toString());
					writer2judge.newLine();
					constructTerm.setLength(0);
					constructTerm2judge.setLength(0);
				}
				++qid;
			}
		}
	}

	private static String path = "D:\\yyh_yuanyuhu_graduation_experimental\\node2vec_SSVM\\dataset2word\\";
	// Entry point of this class: generates the SSVM cross-validation datasets for all 10 folds.
	/**
	 * Generates the SSVM 10-fold cross-validation datasets. For each fold i the
	 * CRF-format files under {@code path}/dataCRF&lt;i&gt; (already produced by
	 * {@link #doGenerateCRFCrossValidDataSet()}) are converted into SSVM test, judge
	 * and train files under {@code path}/data&lt;i&gt;.
	 *
	 * <p>Fixes: the original continued after a failed {@link #doBuildWord_VectorMap()}
	 * — with an empty/partial vector map every generated row would be wrong, so we now
	 * log and return early. Also removed the dead {@code setLength(0)} calls and the
	 * redundant StringBuffers (plain string concatenation builds each path once).
	 */
	public void doGenerateSsvmCrossValidDataSet() {
		try {
			doBuildWord_VectorMap();
		} catch (IOException e1) {
			e1.printStackTrace();
			return; // no vector map -> generated datasets would be useless
		}
		// The dataCRF folders are assumed to exist already; if you want to run this
		// standalone, generate the CRF data first.

		for (int i = 0; i < 10; ++i) {
			String ssvmDir = path + "data" + i + File.separator;
			String crfDir = path + "dataCRF" + i + File.separator;
			try {
				doGenerateSsvmDataSet2Test(ssvmDir + "test" + File.separator + "test.txt",
						ssvmDir + "test" + File.separator + "test2Judge.txt", crfDir + "test");
			} catch (IOException e) {
				e.printStackTrace(); // best-effort: continue with the remaining folds
			}
			try {
				doGenerateSsvmDataSet2Train(ssvmDir + "train" + File.separator + "train.dat", crfDir + "train");
			} catch (IOException e) {
				e.printStackTrace(); // best-effort: continue with the remaining folds
			}
		}
	}

	// ---- Dependency-injection accessors (used by the bean wiring) ----

	public InitSeedDB getInitSeedDB() {
		return initSeedDB;
	}

	public void setInitSeedDB(InitSeedDB initSeedDB) {
		this.initSeedDB = initSeedDB;
	}

	public XianBingShi_New_zhangDB getXianBingShi_New_zhangDB() {
		return xianBingShi_New_zhangDB;
	}

	public void setXianBingShi_New_zhangDB(XianBingShi_New_zhangDB xianBingShi_New_zhangDB) {
		this.xianBingShi_New_zhangDB = xianBingShi_New_zhangDB;
	}

	public Node2vec_WordBookDB getNode2vec_WordBookDB() {
		return node2vec_WordBookDB;
	}

	public void setNode2vec_WordBookDB(Node2vec_WordBookDB node2vec_WordBookDB) {
		this.node2vec_WordBookDB = node2vec_WordBookDB;
	}

	public CreateCRFsDataSet getCreateCRFsDataSet() {
		return createCRFsDataSet;
	}

	public void setCreateCRFsDataSet(CreateCRFsDataSet createCRFsDataSet) {
		this.createCRFsDataSet = createCRFsDataSet;
	}
	

}
